packages

#Install and import required packages 
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(leaflet)
library(ggpubr)
## Loading required package: ggplot2
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(stringdist)
library(stringr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ tibble  3.2.1
## ✔ purrr   1.0.2     ✔ tidyr   1.3.0
## ✔ readr   2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks stringdist::extract()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggpmisc)
## Loading required package: ggpp
## Registered S3 methods overwritten by 'ggpp':
##   method                  from   
##   heightDetails.titleGrob ggplot2
##   widthDetails.titleGrob  ggplot2
## 
## Attaching package: 'ggpp'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## 
## Registered S3 method overwritten by 'ggpmisc':
##   method                  from   
##   as.character.polynomial polynom
library(leaflet.extras)
library(ggplot2)
library(leaftime)

Police Violence Data

# Post Rename Variable Names for mpv
# State County date gender race age

mpv <- read.csv("Mapping Police Violence.csv")
mpv <- mpv %>% select(date, state, county, gender, race, age) %>%
  rename(State = state, County = county)
mpv$State <- state.name[match(mpv$State, state.abb)]  #changes state id to name

mpv <- mpv%>%
  mutate(State = ifelse(State == "Virgina", "Virginia", State)) %>%
  mutate(State = ifelse(State == "Massachusetts", "Massachusettes", State))


mpv$date <- substr(mpv$date, nchar(mpv$date) - 3, nchar(mpv$date))
mpv <- mpv %>% filter(date=="2022")

head(mpv)
##   date       State                  County gender  race age
## 1 2022      Kansas                 Johnson   Male White  27
## 2 2022   Louisiana West Baton Rouge Parish Female White  17
## 3 2022  New Jersey                  Morris   Male White  61
## 4 2022   Louisiana West Baton Rouge Parish Female White  15
## 5 2022    Oklahoma                   Tulsa   Male White  27
## 6 2022 Mississippi                 Jackson   Male Black  41

County Population Data

# Post Rename Variable Names for cpop
# State County ESTIMATESBASE2020 POPESTIMATE2020 POPESTIMATE2021 POPESTIMATE2022

cpop <- read.csv("co-est2022-alldata.csv")
cpop<- cpop %>% select(STNAME, CTYNAME, ESTIMATESBASE2020, POPESTIMATE2020, POPESTIMATE2021, POPESTIMATE2022) %>% rename(State = STNAME, County=CTYNAME) %>%
    mutate(State = ifelse(State == "Virgina", "Virginia", State)) %>%
  mutate(State = ifelse(State == "Massachusetts", "Massachusettes", State))

Police Training Data

# Post Rename Variable Names for pt
# State basic_training field_training police_min_age

pt <- read_excel("ICJTR+Basic+Training+Data+Set.xlsx")
colnames(pt)[colnames(pt) == '\"POLICE OFFICER MINIMUM AGE\"'] <- "police min age"
pt <- pt %>% select(STATE, `POLICE BASIC TRAINING HRS 2020`, `POLICE FIELD TRAINING`, `police min age`) %>% rename(State = STATE, basic_training = `POLICE BASIC TRAINING HRS 2020`, field_training =`POLICE FIELD TRAINING`, police_min_age = `police min age`)
pt <- pt%>%
  mutate(State = ifelse(State == "Virgina", "Virginia", State)) %>%
  mutate(State = ifelse(State == "Massachusetts", "Massachusettes", State))

Land Area Data

# Post Rename Variable Names for land_are
# State County SQMI 

land_area <- read.delim("2023_Gaz_counties_national.txt") 
land_area <- land_area %>% select(USPS,NAME,ALAND_SQMI) %>% 
  rename(County = NAME, SQMI = ALAND_SQMI, State = USPS)
land_area$State <- state.name[match(land_area$State, state.abb)] #ST_ID to Name
land_area <- land_area%>%
  mutate(State = ifelse(State == "Virgina", "Virginia", State)) %>%
  mutate(State = ifelse(State == "Massachusetts", "Massachusettes", State))

Merging + New Numerical/Categorical

# Post Merge & Rename Variable Names for pland
# State County SQMI Population Density Denisty_c

# Merge land area data and county population data
pland <- merge(land_area, cpop, by = c('State','County'), 
         all = TRUE) %>% filter(!is.na(SQMI)) 

# New Numerical Value 'Population' as Average of Estimates 2020-2022
pland <- pland %>% 
  mutate(Population = rowMeans(select(
    ., 
    ESTIMATESBASE2020,
    POPESTIMATE2020, 
    POPESTIMATE2021, 
    POPESTIMATE2022), 
    na.rm = TRUE))

#New Numerical Value 'Density' as Density of Population by SQ mi Area
pland <- pland %>% 
  mutate(Density = Population/SQMI)

# New Categorical Value 'Density_c' as Range of population by SQ mi Area
pland <- pland %>%
  mutate(Density_c = cut(Population / SQMI, 
        breaks = c(0, 10, 100, 1000, Inf),
        labels = c("sparse", "low-moderate", "moderate-high", "very high"),
        include.lowest = TRUE)) %>%
  select(State,County,SQMI,Population,Density,Density_c)

#uncomment to check 
head(pland)
##     State         County     SQMI Population   Density     Density_c
## 1 Alabama Autauga County  594.455   59168.25  99.53361  low-moderate
## 2 Alabama Baldwin County 1589.863  237694.00 149.50596 moderate-high
## 3 Alabama Barbour County  885.008   24857.25  28.08703  low-moderate
## 4 Alabama    Bibb County  622.470   22214.50  35.68766  low-moderate
## 5 Alabama  Blount County  644.891   59207.25  91.80970  low-moderate
## 6 Alabama Bullock County  622.815   10285.00  16.51373  low-moderate

Final Merging

# Post Merge & Rename Variable Names for df_final
# State County SQMI Population Density Density_c Victims

#Sorts mpv and counts the number of victims 
mpv2 <- mpv %>%
  count(State, County, name = "Victims")

# Remove "County" from County column in Pland if it exists
# Changes to match naming conventions
pland <- pland %>%
  mutate(County = str_replace(County, " County$", "") %>% str_trim())

# merge to final data set 
df_final <- merge(pland,mpv2, by=c('State','County'),all=TRUE)
# uncomment to check
 head(df_final)
##     State  County     SQMI Population   Density     Density_c Victims
## 1 Alabama Autauga  594.455   59168.25  99.53361  low-moderate      NA
## 2 Alabama Baldwin 1589.863  237694.00 149.50596 moderate-high       2
## 3 Alabama Barbour  885.008   24857.25  28.08703  low-moderate      NA
## 4 Alabama    Bibb  622.470   22214.50  35.68766  low-moderate      NA
## 5 Alabama  Blount  644.891   59207.25  91.80970  low-moderate       1
## 6 Alabama Bullock  622.815   10285.00  16.51373  low-moderate      NA

Summarized data frame for Police Training and Victim Count

# Post Merge & Rename Variable Names for ptmpv
# State basic_training, field_training, police_min_age, Victims

# count deaths by state 
mpv3 <- mpv %>%
  count(State, name = "Victims") 

ptmpv <- merge(pt,mpv3, by='State',all=TRUE) %>% 
  select(State, basic_training, field_training, police_min_age, Victims) %>%
  distinct() %>%
  filter(State != "Washington D.C.*") %>%
  filter(!is.na(police_min_age))
head(ptmpv)
##        State basic_training field_training police_min_age Victims
## 1    Alabama            520              0             19      25
## 2     Alaska            650             40             21       8
## 3    Arizona            585              0             21      58
## 4   Arkansas            520              0             21      15
## 5 California            664            560             18     147
## 6   Colorado            556              0             21      39

Figure 1 : Victim Ratio vs Police Training Hours

# Create data table of final set for temp use
df_final_temp <- df_final %>%
  group_by(State) %>%
  summarise(total_population = sum(Population, na.rm = TRUE)) %>%
  na.omit()

#Add columns for population to victim rate and training total hours
ptmpv <- ptmpv %>% 
  mutate(Victims_TotalPop = Victims/df_final_temp$total_population) %>%
  mutate(TotalTraining = basic_training + as.numeric(field_training)) 

# Graph 
ggplot(ptmpv, aes(x = TotalTraining, y = Victims_TotalPop)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x) +
  stat_poly_eq(formula = y ~ x, 
               aes(label = paste(..rr.label.., sep = " ~ ")),
               parse = TRUE,
               size = 4) +  
  labs(title = "Victim Ratio vs Total Police Training Hours",
       x = "Total Training Hours",
       y = "Victims to Total Population Ratio") +
  theme_minimal()
## Warning: The dot-dot notation (`..rr.label..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(rr.label)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Figure 2 : Victim Ratio vs Field Training Hours

#Graph 
ggplot(ptmpv, aes(x = as.numeric(field_training), y = Victims_TotalPop)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x) +
  stat_poly_eq(formula = y ~ x, 
               aes(label = paste(..rr.label.., sep = " ~ ")),
               parse = TRUE,
               size = 4) +  
  labs(title = "Victim Ratio vs Field Police Training Hours",
       x = "Field Training Hours",
       y = "Victims to Total Population Ratio") +
  theme_minimal()

Figure 3 : Victim Ratio vs Basic Training Hours

#Graph 
ggplot(ptmpv, aes(x = as.numeric(basic_training), y = Victims_TotalPop)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x) +
  stat_poly_eq(formula = y ~ x, 
               aes(label = paste(..rr.label.., sep = " ~ ")),
               parse = TRUE,
               size = 4) +  
  labs(title = "Victim Ratio vs Basic Police Training Hours",
       x = "Basic Training Hours",
       y = "Victims to Total Population Ratio") +
  theme_minimal()

Figure 4 : Victim Ratio vs County Density

# Create data table of final set for temp use
df_final_temp <- df_final %>%
  group_by(State) %>% 
  mutate(total_population = sum(Population, na.rm = TRUE)) %>%
  mutate(Victims = coalesce(Victims, 0))

# Graph 
ggplot(df_final_temp, aes(x = Density,
  y = Victims/df_final_temp$total_population)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x) +
  stat_poly_eq(formula = y ~ x, 
               aes(label = paste(..rr.label.., sep = " ~ ")),
               parse = TRUE,
               size = 4) +  
  labs(title = "Victim Ratio vs US County Density",
       x = "Density per sq mi",
       y = "Victims to Total Population Ratio") +
  theme_minimal()
## Warning: Removed 117 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 117 rows containing non-finite values (`stat_poly_eq()`).
## Warning: Removed 117 rows containing missing values (`geom_point()`).

Figure 5 Victim Ratio and County density

df_final_temp <- df_final %>%
  group_by(State, Density_c) %>%  # Group by both State and Density
  summarise(total_population = sum(Population, na.rm = TRUE),
            Victims = sum(coalesce(Victims, 0)))  %>%
  filter(!is.na(Density_c))
## `summarise()` has grouped output by 'State'. You can override using the
## `.groups` argument.
# Graph with geom_bar
ggplot(df_final_temp, aes(x = Density_c,
  y = (Victims/total_population)*100000,
  fill=factor(Density_c))) +  
  geom_bar(stat = "identity") +
  labs(title = "Victim Ratio and US County Density Category",
       x = "Density Category",
       y = "Victims to per 100,000 population",
       fill = "Desnity Category") +
  theme_minimal()

Figure 6 : Washington Victims vs County Density

# Create data table of final set for temp use
df_final_temp <- df_final %>%
  filter(State == "Washington") %>% #Change State here
  mutate(Victims = coalesce(Victims, 0))

# Graph 
ggplot(df_final_temp, aes(x = Density,
  y = Victims/Population*10000)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x) +
  stat_poly_eq(formula = y ~ x, 
               aes(label = paste(..rr.label.., sep = " ~ ")),
               parse = TRUE,
               size = 4) +  
  labs(title = "Victim Ratio vs Washington's County Density",
       x = "Density per sq mi",
       y = "Victims per 10,000",
       color = "") +
  theme_minimal()
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing non-finite values (`stat_poly_eq()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).

Figure 7 : State Visualization

# Replace With STATE county Shapefile Folder
counties <- st_read("SHOREWA/SHOREWA")
## Reading layer `Washington_Counties_with_Natural_Shoreline___washsh_area' from data source `/Users/chsu/RStudio/info-201-finalproject-main 3/SHOREWA/SHOREWA' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 39 features and 7 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -124.7331 ymin: 45.54373 xmax: -116.9152 ymax: 49.00253
## Geodetic CRS:  WGS 84
df_final_temp <- df_final %>%
  filter(State == "Washington") %>%
  rename(COUNTY = County) %>%
  group_by(COUNTY) %>%
  select(COUNTY, Population, Density)

mpv_temp <- mpv %>%
  filter(State == "Washington") %>%
  rename(COUNTY = County) %>%
  group_by(COUNTY) %>%
  summarise(Victims = n()) %>%
  distinct() %>%
  ungroup()

mpv_temp <- merge(df_final_temp,mpv_temp, by="COUNTY", all=TRUE) 
mpv_temp$COUNTY <- toupper(mpv_temp$COUNTY)

mpv_temp <- mpv_temp %>%
  mutate(VictimRatio = Victims/Population * 1000)

map_data <- merge(counties, mpv_temp, by.x = "COUNTY", by.y = "COUNTY", all = FALSE)
color_scale <- scale_fill_gradient(
  low = "beige",
  high = "red",
  na.value = "grey"  
)

ggplot() +
  geom_sf(data = map_data, aes(fill = VictimRatio), color = "white", size = 0.2) +
  color_scale +
  labs(title = "Victims per 1000 by County in Washington State in 2022",
       fill = "Victims per 1000",
       x="Grey indicates missing data or 0 deaths")

National Visualization per 2022

# Replace With STATE county Shapefile Folder
counties <- st_read("USAshapefile/USAshapefile")
## Reading layer `USA_Counties_(Generalized)' from data source 
##   `/Users/chsu/RStudio/info-201-finalproject-main 3/USAshapefile/USAshapefile' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 3142 features and 60 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -178.2176 ymin: 18.92179 xmax: -66.96927 ymax: 71.40624
## Geodetic CRS:  WGS 84
df_final_temp <- df_final %>%
  rename(NAME = County) %>%
  group_by(NAME) %>%
  select(NAME, Population, Density)

mpv_temp <- mpv %>%
  rename(NAME = County) %>%
  group_by(NAME) %>%
  summarise(Victims = n()) %>%
  distinct() %>%
  ungroup()
#Dont change anything above this 

mpv_temp <- merge(df_final_temp,mpv_temp, by="NAME", all=TRUE) 
mpv_temp$NAME <- toupper(mpv_temp$NAME)
counties$NAME <- toupper(counties$NAME)
counties <- counties %>% filter(STATE_NAME!="Alaska")
mpv_temp <- mpv_temp %>%
  mutate(VictimRatio = Victims/Population * 1000)

map_data <- merge(counties, mpv_temp, by.x = "NAME", by.y = "NAME", all = FALSE)
map_data <- map_data %>%
  mutate(VictimRatio = ifelse((VictimRatio==0), NA , VictimRatio))
 
leaflet(data = map_data) %>%
  addProviderTiles("OpenStreetMap.Mapnik") %>%
  addPolygons(
    data = map_data,
    fillColor = ~colorQuantile("YlOrRd", map_data$VictimRatio, n = 10)(map_data$VictimRatio),,
    fillOpacity = 0.7,
    color = "lightblue",
    weight = 1,
    label = ~paste(NAME, "<br>", "Victims per 1000:", formatC(VictimRatio, format = "f", digits = 1))
  ) %>%
  addScaleBar(position = "bottomleft") %>%
  addMiniMap(position = "bottomleft") %>%
  addControl("zoomSlider", position = "topright") %>%
  addFullscreenControl()
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette YlOrRd is 9
## Returning the palette you asked for with that many colors